library(prettydoc)
library(arsenal)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(here)
## here() starts at /Users/stephcopeland/Documents/GitHub/222 Final Project/last take
library(tidyr)
library(DescTools)
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:arsenal':
##
## %nin%, N
library(tibble)
library(calecopal)
library(ggeffects)
library(gt)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(modelr)
##
## Attaching package: 'modelr'
## The following object is masked from 'package:ggeffects':
##
## data_grid
library(gtsummary)
data1 <- read.csv("last_take.csv")
head(data1)
## country code area_sqkm population GDP_capita dalys
## 1 Afghanistan AFG 652860 36296111 519.8889 2.544699e+05
## 2 Angola AGO 1246700 29816769 4095.8101 8.006665e+05
## 3 Albania ALB 27400 2873457 4531.0208 1.543584e+02
## 4 Andorra AND 470 76997 38964.9045 3.942648e-01
## 5 Argentina ARG 2736690 44044811 14613.0418 2.894639e+04
## 6 Armenia ARM 28470 2944789 3914.5279 1.241430e+03
## tree_cover_loss_ha
## 1 25.70585
## 2 280456.32110
## 3 1689.02503
## 4 3.42216
## 5 233919.67780
## 6 41.74732
data2 <- data1 %>%
mutate(prop_tree_loss = (tree_cover_loss_ha/area_sqkm)) %>%
mutate(prop_dalys = (dalys/population))
head(data2)
## country code area_sqkm population GDP_capita dalys
## 1 Afghanistan AFG 652860 36296111 519.8889 2.544699e+05
## 2 Angola AGO 1246700 29816769 4095.8101 8.006665e+05
## 3 Albania ALB 27400 2873457 4531.0208 1.543584e+02
## 4 Andorra AND 470 76997 38964.9045 3.942648e-01
## 5 Argentina ARG 2736690 44044811 14613.0418 2.894639e+04
## 6 Armenia ARM 28470 2944789 3914.5279 1.241430e+03
## tree_cover_loss_ha prop_tree_loss prop_dalys
## 1 25.70585 3.937421e-05 7.010940e-03
## 2 280456.32110 2.249589e-01 2.685289e-02
## 3 1689.02503 6.164325e-02 5.371872e-05
## 4 3.42216 7.281192e-03 5.120522e-06
## 5 233919.67780 8.547540e-02 6.572033e-04
## 6 41.74732 1.466362e-03 4.215684e-04
data3 <- data2 %>%
mutate(per_tree_loss = (prop_tree_loss * 100)) %>%
mutate(per_dalys = (prop_dalys * 100))
head(data3)
## country code area_sqkm population GDP_capita dalys
## 1 Afghanistan AFG 652860 36296111 519.8889 2.544699e+05
## 2 Angola AGO 1246700 29816769 4095.8101 8.006665e+05
## 3 Albania ALB 27400 2873457 4531.0208 1.543584e+02
## 4 Andorra AND 470 76997 38964.9045 3.942648e-01
## 5 Argentina ARG 2736690 44044811 14613.0418 2.894639e+04
## 6 Armenia ARM 28470 2944789 3914.5279 1.241430e+03
## tree_cover_loss_ha prop_tree_loss prop_dalys per_tree_loss per_dalys
## 1 25.70585 3.937421e-05 7.010940e-03 0.003937421 0.7010939935
## 2 280456.32110 2.249589e-01 2.685289e-02 22.495894850 2.6852892837
## 3 1689.02503 6.164325e-02 5.371872e-05 6.164324938 0.0053718717
## 4 3.42216 7.281192e-03 5.120522e-06 0.728119233 0.0005120522
## 5 233919.67780 8.547540e-02 6.572033e-04 8.547540196 0.0657203287
## 6 41.74732 1.466362e-03 4.215684e-04 0.146636171 0.0421568401
ggplot(data3, aes(sample = prop_dalys)) +
geom_qq() + geom_qq_line()
## Warning: Removed 15 rows containing non-finite values (stat_qq).
## Warning: Removed 15 rows containing non-finite values (stat_qq_line).
ggplot(data3, aes(sample = dalys)) +
geom_qq() + geom_qq_line()
## Warning: Removed 13 rows containing non-finite values (stat_qq).
## Warning: Removed 13 rows containing non-finite values (stat_qq_line).
data3 <- data3 %>%
mutate(logdalys = log(dalys))
ggplot(data3, aes(sample = logdalys)) +
geom_qq() + geom_qq_line()
## Warning: Removed 13 rows containing non-finite values (stat_qq).
## Warning: Removed 13 rows containing non-finite values (stat_qq_line).
data4 <- data3 %>%
mutate(log_dalys = log(prop_dalys))
head(data4)
## country code area_sqkm population GDP_capita dalys
## 1 Afghanistan AFG 652860 36296111 519.8889 2.544699e+05
## 2 Angola AGO 1246700 29816769 4095.8101 8.006665e+05
## 3 Albania ALB 27400 2873457 4531.0208 1.543584e+02
## 4 Andorra AND 470 76997 38964.9045 3.942648e-01
## 5 Argentina ARG 2736690 44044811 14613.0418 2.894639e+04
## 6 Armenia ARM 28470 2944789 3914.5279 1.241430e+03
## tree_cover_loss_ha prop_tree_loss prop_dalys per_tree_loss per_dalys
## 1 25.70585 3.937421e-05 7.010940e-03 0.003937421 0.7010939935
## 2 280456.32110 2.249589e-01 2.685289e-02 22.495894850 2.6852892837
## 3 1689.02503 6.164325e-02 5.371872e-05 6.164324938 0.0053718717
## 4 3.42216 7.281192e-03 5.120522e-06 0.728119233 0.0005120522
## 5 233919.67780 8.547540e-02 6.572033e-04 8.547540196 0.0657203287
## 6 41.74732 1.466362e-03 4.215684e-04 0.146636171 0.0421568401
## logdalys log_dalys
## 1 12.4469377 -4.960284
## 2 13.5931998 -3.617382
## 3 5.0392773 -9.831749
## 4 -0.9307325 -12.182254
## 5 10.2732009 -7.327517
## 6 7.1240192 -7.771529
ggplot(data4, aes(sample = log_dalys)) +
geom_qq() + geom_qq_line()
## Warning: Removed 15 rows containing non-finite values (stat_qq).
## Warning: Removed 15 rows containing non-finite values (stat_qq_line).
data4 <- data4 %>%
mutate(log_GDP = log(GDP_capita))
mod1 <- lm(logdalys ~ prop_tree_loss, data = data4)
summary(mod1)
##
## Call:
## lm(formula = logdalys ~ prop_tree_loss, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.059 -2.194 0.059 2.519 7.314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.1297 0.2579 35.397 <2e-16 ***
## prop_tree_loss -0.1450 0.1040 -1.395 0.165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.343 on 174 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.01105, Adjusted R-squared: 0.00537
## F-statistic: 1.945 on 1 and 174 DF, p-value: 0.1649
ggplot(data4, aes(y = log_dalys, x = prop_tree_loss)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
data4 <- data4 %>%
mutate(win_prop_tree = Winsorize(prop_tree_loss, minval = NULL, maxval = NULL, probs = c(0.00, 0.95), na.rm = TRUE, type = 9))
graph1 <- ggplot(data4, aes(y = log_dalys, x = win_prop_tree)) +
geom_point(color = 'darkblue', size = 3) +
stat_smooth(method = "lm", color = "red", fill = "#69b3a2", se = TRUE) +
labs(x = "Proportion of Forest Loss (ha)", y = "Proportion of DALYs (log)") +
ggtitle("Effect of Forest Loss on Disability Adjusted Life Years (DALYs) by country") +
theme_bw() +
theme(plot.title=element_text(hjust=0.5, vjust=0.5))
graph1
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
mod2 <- lm(logdalys ~ win_prop_tree, data = data4)
summary(mod2)
##
## Call:
## lm(formula = logdalys ~ win_prop_tree, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8125 -2.1737 -0.1597 2.6571 7.4237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.8773 0.3137 28.297 <2e-16 ***
## win_prop_tree 0.6133 0.6480 0.946 0.345
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.353 on 174 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.005121, Adjusted R-squared: -0.0005966
## F-statistic: 0.8957 on 1 and 174 DF, p-value: 0.3453
fire <- cal_palette(name = "fire", n = 15, type = "continuous")
superbloom <- cal_palette(name = "superbloom3", n = 15, type = "continuous")
kelp <- cal_palette(name = "kelp1", n = 15, type = "continuous")
kelp1 <- cal_palette(name = "kelp1", n = 2, type = "continuous")
graph2 <- ggplot(data4, aes(y = log_dalys, x = win_prop_tree, color = log_GDP)) +
geom_point(size = 3) +
stat_smooth(method = "lm", color = "red", fill = "#69b3a2", se = TRUE) +
labs(x = "Proportion of Forest Loss (ha)", y = "Proportion of DALYs (log)") +
ggtitle("Effect of Forest Loss on Disability Adjusted Life Years (DALYs) by Country") +
theme_bw()
graph2 +
guides(size = FALSE) +
labs(colour = "GDP per Capita") +
theme(legend.position = "bottom") +
theme(plot.title=element_text(hjust=0.5, vjust=0.5)) +
scale_color_gradientn(colours = kelp)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
mod3 <- lm(log_dalys ~ win_prop_tree*log_GDP, data = data4)
summary(mod3)
##
## Call:
## lm(formula = log_dalys ~ win_prop_tree * log_GDP, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1073 -0.9923 0.1520 0.9996 4.7735
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.46699 0.94271 4.738 4.55e-06 ***
## win_prop_tree 1.55754 1.92518 0.809 0.420
## log_GDP -1.33315 0.10841 -12.297 < 2e-16 ***
## win_prop_tree:log_GDP -0.09874 0.22240 -0.444 0.658
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.676 on 169 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 0.5837, Adjusted R-squared: 0.5763
## F-statistic: 78.99 on 3 and 169 DF, p-value: < 2.2e-16
names(mod3)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "na.action" "xlevels" "call" "terms"
## [13] "model"
#ggplot(data4, aes(y = log_dalys, x = log_GDP, color = win_prop_tree)) +
#geom_point(aes(size = 0.5)) +
#stat_smooth(method = "lm", se = FALSE)
graph3 <- ggplot(data4, aes(y = log_dalys, x = log_GDP, color = log_GDP)) +
geom_point(size = 3) +
stat_smooth(method = "lm", color = "blue", fill = "#69b3a2", se = TRUE) +
labs(x = "GDP per capita", y = "Proportion of DALYs (log)") +
ggtitle("Effect of Economic Wealth on Disability Adjusted Life Years (DALYs) by country") +
theme_bw()
graph3 +
guides(size = FALSE) +
labs(colour = "GDP per Capita") +
theme(legend.position = "bottom") +
theme(plot.title=element_text(hjust=0.5, vjust=0.5)) +
scale_color_gradientn(colours = kelp)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).
graph4 <- ggplot(data4, aes(y = log_dalys, x = log_GDP, color = win_prop_tree)) +
geom_point(size = 3) +
stat_smooth(method = "lm", color = "blue", fill = "#69b3a2", se = TRUE) +
labs(x = "GDP per capita", y = "Proportion of DALYs (log)") +
ggtitle("Effect of Economic Wealth on Disability Adjusted Life Years (DALYs) by country") +
theme_bw()
graph4 +
guides(size = FALSE) +
labs(colour = "Proportion of Forest Loss") +
theme(legend.position = "bottom") +
theme(plot.title=element_text(hjust=0.5, vjust=0.5)) +
scale_color_gradientn(colours = superbloom)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).
data5 <- data4 %>%
mutate(country_income_level =
case_when(GDP_capita >= 13000 ~ "high",
GDP_capita >= 4000 ~ "middle",
GDP_capita <= 4000 ~ "low"))
data5 %>%
group_by(country_income_level) %>%
summarize(n = n())
## # A tibble: 4 × 2
## country_income_level n
## <chr> <int>
## 1 high 52
## 2 low 72
## 3 middle 51
## 4 <NA> 14
equLOWcountries <- c("Burundi", "Benin", "Bhangladesh", "Bolivia", "Central African Republic", "Cote d'Ivoire", "Cameroon", "Democratic Republic of Congo", "Congo", "Comoros", "Ethiopia", "Ghana", "Guinea", "Gambia", "Honduras", "Haiti", "Indonesia", "Kenya", "Cambodia", "Laos", "Liberia", "Madagascar", "Mali", "Myanmar", "Mozambique",
"Mauritania", "Malawi", "Nigeria", "Nicaragua", "Phillipines", "Rwanda", "Senegal", "Sierra Leone", "El Salvador", "Eswatini", "Togo", "Tanzania", "Uganda", "Vietnam", "Zambia", "Zimbabwe")
noneqLOWcountries <- c("AFG", "ARM", "BTN", "CPV", "EGY", "ERI", "FSM", "GNB", "IND", "KGZ", "LSO", "MAR", "MDA", "MNG", "NER", "NPL", "PAK", "PNG", "SDN", "SLB", "SSD", "STP", "SYR", "TCD", "TJK", "TLS", "TUN", "UKR", "UZB", "VUT")
data6 <- data5 %>%
filter(country_income_level == "low") %>%
filter(!(code %in% c(noneqLOWcountries)))
mod3 <- lm(log_dalys ~ win_prop_tree + log_GDP, data = data4)
mod4 <- lm(log_dalys ~ win_prop_tree * GDP_capita, data = data6)
summary(mod4)
##
## Call:
## lm(formula = log_dalys ~ win_prop_tree * GDP_capita, data = data6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.90404 -0.93667 0.00806 0.77030 1.99482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.2146577 0.4261939 -7.543 4.59e-09 ***
## win_prop_tree 1.0140100 0.7203759 1.408 0.1674
## GDP_capita -0.0005546 0.0002762 -2.008 0.0518 .
## win_prop_tree:GDP_capita -0.0007232 0.0004452 -1.625 0.1125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.075 on 38 degrees of freedom
## Multiple R-squared: 0.4589, Adjusted R-squared: 0.4162
## F-statistic: 10.74 on 3 and 38 DF, p-value: 2.989e-05
mod5 <- lm(log_dalys ~ win_prop_tree * GDP_capita, data = data6)
summary(mod5)
##
## Call:
## lm(formula = log_dalys ~ win_prop_tree * GDP_capita, data = data6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.90404 -0.93667 0.00806 0.77030 1.99482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.2146577 0.4261939 -7.543 4.59e-09 ***
## win_prop_tree 1.0140100 0.7203759 1.408 0.1674
## GDP_capita -0.0005546 0.0002762 -2.008 0.0518 .
## win_prop_tree:GDP_capita -0.0007232 0.0004452 -1.625 0.1125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.075 on 38 degrees of freedom
## Multiple R-squared: 0.4589, Adjusted R-squared: 0.4162
## F-statistic: 10.74 on 3 and 38 DF, p-value: 2.989e-05
#ggplot(data6, aes(y = log_dalys, x = win_prop_tree)) +
#geom_point(aes(color = GDP_capita, size = 1.0)) +
#stat_smooth(method = "lm", se = FALSE)
graph5 <- ggplot(data6, aes(y = log_dalys, x = win_prop_tree, color = GDP_capita)) +
geom_point(size = 4) +
stat_smooth(method = "lm", color = "red", fill = "#69b3a2", se = TRUE) +
labs(x = "Proportion of Forest Loss (ha)", y = "Proportion of DALYs (log)") +
ggtitle("Effect of Forest Loss on Disability Adjusted Life Years (DALYs) \n for Low Income, Tropical Nations") +
theme_bw()
graph5 +
guides(size = FALSE) +
labs(colour = "GDP per Capita") +
theme(legend.position = "bottom") +
theme(plot.title=element_text(hjust=0.5, vjust=0.5)) +
scale_color_gradientn(colours = kelp)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'
#ggplot(data6, aes(y = log_dalys, x = log_GDP)) +
#geom_point(aes(color = win_prop_tree, size = 1.0)) +
#stat_smooth(method = "lm", se = FALSE)
graph6 <- ggplot(data6, aes(y = log_dalys, x = GDP_capita, color = win_prop_tree)) +
geom_point(size = 4) +
stat_smooth(method = "lm", color = "red", fill = "#69b3a2", se = TRUE) +
labs(x = "GDP per capita", y = "Proportion of DALYs (log)") +
ggtitle("Effect of Economic Wealth on Disability Adjusted Life Years (DALYs) \n for Low Income, Tropical Nations") +
theme_bw()
graph6 +
guides(size = FALSE) +
labs(colour = "Proportion of Forest Loss") +
theme(legend.position = "bottom") +
theme(plot.title=element_text(hjust=0.5, vjust=0.5)) +
scale_color_gradientn(colours = superbloom)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'
mod5 <- lm(log_dalys ~ win_prop_tree * GDP_capita, data = data6)
summary(mod5)
##
## Call:
## lm(formula = log_dalys ~ win_prop_tree * GDP_capita, data = data6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.90404 -0.93667 0.00806 0.77030 1.99482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.2146577 0.4261939 -7.543 4.59e-09 ***
## win_prop_tree 1.0140100 0.7203759 1.408 0.1674
## GDP_capita -0.0005546 0.0002762 -2.008 0.0518 .
## win_prop_tree:GDP_capita -0.0007232 0.0004452 -1.625 0.1125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.075 on 38 degrees of freedom
## Multiple R-squared: 0.4589, Adjusted R-squared: 0.4162
## F-statistic: 10.74 on 3 and 38 DF, p-value: 2.989e-05
data6 %>%
arrange(desc(population)) %>%
mutate(country = factor(country, country)) %>%
ggplot(aes(x=GDP_capita, y=log_dalys, size=per_tree_loss, fill= GDP_capita)) +
stat_smooth(method = "lm", color = "black", se = FALSE) +
geom_point(alpha=0.5, shape=21, color="black") +
scale_size(range = c(1.4, 19), name="Population (M)") +
scale_fill_viridis(discrete=FALSE, guide=FALSE, option="A") +
theme_bw() +
theme(legend.position="bottom") +
ylab("DALYs (log)") +
xlab("Gdp per Capita") +
theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'
p <- data6 %>%
# Reorder countries to having big bubbles on top
arrange(desc(per_tree_loss)) %>%
mutate(country = factor(country, country)) %>%
# prepare text for tooltip
mutate(text = paste("Country: ", country, "\nPopulation (M): ", population, "\n% DALYs of Population: ", per_dalys, "\nGdp per capita: ", GDP_capita, "\n % Tree Loss of Country: ", per_tree_loss, sep="")) %>%
# Classic ggplot
ggplot( aes(x=GDP_capita, y=log_dalys, size = per_tree_loss, color = GDP_capita, text=text)) +
geom_point(alpha=0.7) +
scale_size(range = c(1.4, 19), name="% Tree Loss") +
scale_color_viridis(discrete=FALSE, guide=FALSE, option = "A") +
theme_bw() +
theme(legend.position="none")
# turn ggplot interactive with plotly
pp <- ggplotly(p, tooltip="text")
pp
Hypothesis Testing and OLS violations Does deforestation contribute to the incidence and burden of neglected tropical diseases within countries?
H0: There is no effect of deforestation and NTD burden. B1 = 0 HA: There will be a positive effect between deforestation and NTD burden. A country with high forest loss will have a high NTD burden whereas a country with low forest loss will have low NTD burden B1 =/ 0
Predictions: I expect to fail to reject the null, this particularly data set is unlikely to capture the intricacies of NTD prevalence and estimation in relation to deforestation.
lm(gdp_growth ~ number + avg_strength, data = hgdp) %>% summary() %>% xtable() %>% kable(caption = “…”)
mod2 <- lm(logdalys ~ win_prop_tree, data = data4)
summary(mod2)
##
## Call:
## lm(formula = logdalys ~ win_prop_tree, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8125 -2.1737 -0.1597 2.6571 7.4237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.8773 0.3137 28.297 <2e-16 ***
## win_prop_tree 0.6133 0.6480 0.946 0.345
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.353 on 174 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.005121, Adjusted R-squared: -0.0005966
## F-statistic: 0.8957 on 1 and 174 DF, p-value: 0.3453
Point Estimates: B0 = 8.8773 B1 = 0.6133
Std. Error: 0.6480
t-statistic: 0.946
p-value: 0.345
R-squared: 0.005121
OLS Assumptions - unbiased and lowest variance correlation: I’m not sure if you really need this with the regression, it’s just another way of showing that this relationship is “insignificant”
0.108 (zero correlation)
#data4 %>%
#summarize(dalys_cor = cor(log_dalys, win_prop_tree, use = "complete.obs")) %>%
#mean(predictions$residuals, na.rm = TRUE)
mod2 <- lm(logdalys ~ win_prop_tree, data = data4)
predictions <- data4 %>% add_predictions(mod2) %>%
mutate(residuals = log_dalys-pred)
#ggplot(data=predictions) + geom_histogram(aes(residuals), bins=80) +
# theme_bw()
ggplot(predictions, aes(residuals)) +
geom_histogram(color = "dark blue", fill = "dark blue", bins = 80) +
geom_density(alpha = 0.2, fill="lightblue") +
theme_bw()
## Warning: Removed 15 rows containing non-finite values (stat_bin).
## Warning: Removed 15 rows containing non-finite values (stat_density).
ggplot(predictions) +
geom_point(aes(x=win_prop_tree, y=residuals), color = "dark blue", size = 3, alpha = .5) +
theme_bw()
## Warning: Removed 15 rows containing missing values (geom_point).
Hypothesis Testing and OLS violations - adding income to model Does deforestation affect the incidence and burden of neglected tropical diseases within countries when the model contributes for the affect of country economic wealth?
H0: There is no effect of deforestation and NTD burden. B1 = 0 HA: There will be a positive effect between deforestation and NTD burden. Low economic, tropical countries with high deforestation rates will have a high incidence of DALYs per their populations. B1 =/ 0
Predictions: I expect to fail to reject the null, even this more focused subset of the data set is unlikely to capture the intricacies of NTD prevalence and estimation in relation to deforestation.
mod3 <- lm(log_dalys ~ win_prop_tree * GDP_capita, data = data4)
summary(mod3)
##
## Call:
## lm(formula = log_dalys ~ win_prop_tree * GDP_capita, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1610 -1.2681 0.0039 1.1068 5.2214
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.960e+00 2.289e-01 -26.034 < 2e-16 ***
## win_prop_tree 1.094e+00 5.146e-01 2.125 0.035 *
## GDP_capita -7.401e-05 9.474e-06 -7.812 5.68e-13 ***
## win_prop_tree:GDP_capita -4.193e-05 2.700e-05 -1.553 0.122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.005 on 169 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 0.4041, Adjusted R-squared: 0.3936
## F-statistic: 38.21 on 3 and 169 DF, p-value: < 2.2e-16
Point Estimates: B0 = -5.96 B1 = 1.094
Std. Error: 5.146e-01
t-statistic: 2.125
p-value: 0.035 *
R-squared: 0.3936
OLS Assumptions - unbiased and lowest variance
predictions2 <- data4 %>% add_predictions(mod3) %>%
mutate(residuals = log_dalys-pred)
#ggplot(data=predictions2) + geom_histogram(aes(residuals), bins=80)
ggplot(predictions2, aes(residuals)) +
geom_histogram(color = "dark blue", fill = "dark blue", bins = 80) +
geom_density(alpha = 0.2, fill="lightblue") +
theme_bw()
## Warning: Removed 16 rows containing non-finite values (stat_bin).
## Warning: Removed 16 rows containing non-finite values (stat_density).
mean(predictions2$residuals, na.rm = TRUE)
## [1] -4.153504e-15
#ggplot(predictions) + geom_point(aes(x=GDP_capita, y=residuals))
ggplot(predictions2) +
geom_point(aes(x=win_prop_tree, y=residuals), color = "dark blue", size = 3, alpha = .5) +
theme_bw()
## Warning: Removed 16 rows containing missing values (geom_point).
Hypothesis Testing and OLS Assumptions for Low Income & Tropical Countries Does deforestation affect the incidence and burden of neglected tropical diseases within countries that a geographically tropical and disadvantaged on the global economic spectrum?
H0: There is no effect of deforestation and NTD burden. B1 = 0 HA: There will be a positive effect between deforestation and NTD burden. Low economic, tropical countries with high deforestation rates will have a high incidence of DALYs per their populations. B1 =/ 0
Predictions: I expect to fail to reject the null, even this more focused subset of the data set is unlikely to capture the intricacies of NTD prevalence and estimation in relation to deforestation.
mod5 <- lm(log_dalys ~ win_prop_tree * GDP_capita, data = data6)
summary(mod5)
##
## Call:
## lm(formula = log_dalys ~ win_prop_tree * GDP_capita, data = data6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.90404 -0.93667 0.00806 0.77030 1.99482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.2146577 0.4261939 -7.543 4.59e-09 ***
## win_prop_tree 1.0140100 0.7203759 1.408 0.1674
## GDP_capita -0.0005546 0.0002762 -2.008 0.0518 .
## win_prop_tree:GDP_capita -0.0007232 0.0004452 -1.625 0.1125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.075 on 38 degrees of freedom
## Multiple R-squared: 0.4589, Adjusted R-squared: 0.4162
## F-statistic: 10.74 on 3 and 38 DF, p-value: 2.989e-05
Point Estimates: B0 = -3.2146577 B1 = 1.0140100
Std. Error: 0.7203759
t-statistic: 1.408
p-value: 0.1674
R-squared: 0.4162
OLS Assumptions - unbiased and lowest variance 1. ‘yes’ to an extent obviously hugely variable 2. Can not test but has likely in this case been violated (diseases are hardly ever explained by just a couple). Go more into this… 3. Yes 4. Mean Residuals:-4.018415e-16, this is the least unbiased of the three models
predictions3 <- data6 %>% add_predictions(mod5) %>%
mutate(residuals = log_dalys-pred)
#ggplot(data=predictions2) + geom_histogram(aes(residuals), bins=80)
ggplot(predictions3, aes(residuals)) +
geom_histogram(color = "dark blue", fill = "dark blue", bins = 80) +
geom_density(alpha = 0.2, fill="lightblue") +
theme_bw()
mean(predictions3$residuals, na.rm = TRUE)
## [1] -4.018415e-16
#ggplot(predictions) + geom_point(aes(x=GDP_capita, y=residuals))
ggplot(predictions3) +
geom_point(aes(x=win_prop_tree, y=residuals), color = "dark blue", size = 3, alpha = .5) +
theme_bw()